########################
# 0. Data Cleaning #####
########################
# Start from an empty workspace: removes every (non-dot-prefixed) object in the
# environment the script is run in. Attached packages and options are not reset.
rm(list = ls())

# created by Franziska Quoß
# franziska.quoss@gesis.org
# uploaded to make data cleaning procedures transparent - original data cannot be made public
# public scientific use file of the SEP can be downloaded here: https://dx.doi.org/10.23662/FORS-DS-1220-1

# load libraries
# NOTE: the attach order matters. plyr is attached after the tidyverse, so
# plyr's mutate(), summarise(), rename() etc. mask the dplyr versions; this is
# why the script calls dplyr::rename() and dplyr::summarise() with an explicit
# prefix further down. raster is attached later still and masks dplyr::select().
library(tidyverse)
library(rio)
library(naniar)
library(plyr)
library(sp)
library(raster)
library(ggpubr)
library(jtools)
library(marginaleffects)
library(grid)


# R version and session info:
# (the commented block that follows is the output recorded by the author)
sessionInfo()
# R version 4.4.0 (2024-04-24 ucrt)
# Platform: x86_64-w64-mingw32/x64
# Running under: Windows 10 x64 (build 19045)
# 
# Matrix products: default
# 
# locale:
#   [1] LC_COLLATE=German_Germany.utf8  LC_CTYPE=German_Germany.utf8   
# [3] LC_MONETARY=German_Germany.utf8 LC_NUMERIC=C                   
# [5] LC_TIME=German_Germany.utf8    
# 
# time zone: Europe/Berlin
# tzcode source: internal
# 
# attached base packages:
#   [1] grid      stats     graphics  grDevices utils     datasets  methods  
# [8] base
# 
# other attached packages:
#   [1] marginaleffects_0.20.1 jtools_2.2.2           ggpubr_0.6.0          
# [4] raster_3.6-26          sp_2.1-4               plyr_1.8.9            
# [7] naniar_1.1.0           rio_1.0.1              lubridate_1.9.3       
# [10] forcats_1.0.0          stringr_1.5.1          dplyr_1.1.4           
# [13] purrr_1.0.2            readr_2.1.5            tidyr_1.3.1           
# [16] tibble_3.2.1           ggplot2_3.5.1          tidyverse_2.0.0       
# 
# loaded via a namespace (and not attached):
#   [1] gtable_0.3.5       visdat_0.6.0       rstatix_0.7.2     
# [4] lattice_0.22-6     tzdb_0.4.0         vctrs_0.6.5       
# [7] tools_4.4.0        generics_0.1.3     fansi_1.0.6       
# [10] pkgconfig_2.0.3    R.oo_1.26.0        data.table_1.15.4 
# [13] gt_0.10.1          lifecycle_1.0.4    compiler_4.4.0    
# [16] munsell_0.5.1      agrmt_1.42.12      terra_1.7-71      
# [19] codetools_0.2-20   readstata13_0.10.1 carData_3.0-5     
# [22] htmltools_0.5.8.1  pillar_1.9.0       car_3.1-2         
# [25] crayon_1.5.2       R.utils_2.12.3     abind_1.4-5       
# [28] tidyselect_1.2.1   digest_0.6.35      stringi_1.8.4     
# [31] pander_0.6.5       fastmap_1.1.1      colorspace_2.1-0  
# [34] cli_3.6.2          magrittr_2.0.3     utf8_1.2.4        
# [37] broom_1.0.6        withr_3.0.0        scales_1.3.0      
# [40] backports_1.4.1    timechange_0.3.0   ggsignif_0.6.4    
# [43] R.methodsS3_1.8.2  hms_1.1.3          rlang_1.1.3       
# [46] Rcpp_1.0.12        glue_1.7.0         xml2_1.3.6        
# [49] rstudioapi_0.16.0  R6_2.5.1   


# import data ####
# Read the survey extract (Stata file, not part of the public upload) via rio.
w4 <- import("do_not_upload/data/w4_small.dta")

# Turn the negative missing-value codes into NA in every column of w4.
# naniar checks each column against the code list, so this step is slow.
w4 <- replace_with_na_all(
  data = w4,
  condition = ~ .x %in% c(-9, -8, -33, -44, -55, -66, -77, -88, -97, -99)
)

# rename relevant variables

# left-right self-placement: w4_q22
# table(w4$w4_q22, exclude = NULL)
# shift down by one so the scale is comparable with the politicians'
# left-right scale (0-10 per the original note; presumably w4_q22 runs
# from 1 to 11 -- TODO confirm against the codebook)
w4$w4_q22_rescaled <- w4$w4_q22 - 1

# party identification: w4_q23
table(w4$w4_q23, exclude = NULL)

# Only 6-7 major parties appear in the candidate data, so respondents who
# identify with any other party get NA.

# table(w4$w4_cand1_partyid, w4$w4_cand1_partyname, exclude = NULL)
# Translate the respondent codes (1-9) into the numeric party levels used in
# the candidate data. Position k of the lookup vector is the new code for
# w4_q23 == k; values outside 1-9 (including NA) become NA.
w4$w4_q23_re <- c(
  3,   # 1 = SVP
  4,   # 2 = SP
  2,   # 3 = FDP
  1,   # 4 = CVP
  7,   # 5 = BDP
  5,   # 6 = GPS
  6,   # 7 = glp
  NA,  # 8 = EVP, not on the ballots
  NA   # 9 = others
)[match(w4$w4_q23, 1:9)]


# calculate Smartvote score individual ####

# Recode the five w4_q12 items from answer codes 1-4 onto a 0-10 scale
# (0, 2.5, 7.5, 10); items 3 and 4 are reverse-coded.
# Answer code 5 is mapped to NA explicitly. Previously only 1:4 were mapped, so
# a 5 stayed in the data as the number 5 and was averaged into the score,
# although the intent stated here was always "set 5 to NA".
scale_forward <- c(0, 2.5, 7.5, 10.0, NA)
scale_reverse <- c(10.0, 7.5, 2.5, 0, NA)

w4$w4_q12x1_rec <- mapvalues(w4$w4_q12x1, 1:5, scale_forward)
w4$w4_q12x2_rec <- mapvalues(w4$w4_q12x2, 1:5, scale_forward)
w4$w4_q12x3_rec <- mapvalues(w4$w4_q12x3, 1:5, scale_reverse)
w4$w4_q12x4_rec <- mapvalues(w4$w4_q12x4, 1:5, scale_reverse)
w4$w4_q12x5_rec <- mapvalues(w4$w4_q12x5, 1:5, scale_forward)

env_items_ind <- paste0("w4_q12x", 1:5, "_rec")

# Score = mean of the answered items, rounded to 4 decimals. Respondents with
# no valid item get NaN from rowMeans(na.rm = TRUE); is.na() is TRUE for those.
w4$env_score_ego <- round(rowMeans(w4[, env_items_ind], na.rm = TRUE), 4)

# check:
# table(w4$env_score_ego, exclude = NULL)
# from 0-10
rm(env_items_ind, scale_forward, scale_reverse)

# Experiment ####

# distribution of the treatment variable (NA shown as its own category)
table(w4$w4_treat6, exclude = NULL)

# who participates in voting experiment:
# table(w4$srph_canton, is.na(w4$w4_cand1_id))
# respondents from cantons AI, AR, GL, NW, OW, UR do not participate

# 18 candidates (w4_q33x1 ... w4_q33x18); tabulate the first as a check
table(w4$w4_q33x1, exclude = NULL)

# Exclude invalid answers (1.5)
# Blank all 18 candidate answers for the flagged respondents.
# NOTE(review): the original note speaks of two respondents (both from the
# online sample), but three PubIds are blanked here -- confirm which is right.
for (q33_col in paste0("w4_q33x", 1:18)) {
  w4[[q33_col]][w4$PubId %in% c(23132, 12722, 28008)] <- NA
}
rm(q33_col)


# rurality of respondents ####
# not public data #
# NOTE(review): this path goes two directories up, while the other inputs in
# this script are read from "data/..." -- confirm the working directory.
loc <- read.csv("../../data/coordinates.csv")
# keep three columns by position (5, 1, 2); they are renamed PubId, lon, lat below
# NOTE(review): the original note listed them as "PubId, lat, lng", i.e. the
# opposite order of the names assigned further down -- verify against the CSV
# header that column 1 really is the longitude.
loc <- loc[,c(5,1,2)]
# drop rows with any missing value (15 cases according to the original note)
loc <- loc[complete.cases(loc),]

# turn into a spatial object (sp SpatialPointsDataFrame, nothing is written to disk)
names(loc) <- c("PubId", "lon", "lat")
xy <- loc[, c("lon", "lat")]

# points are declared as WGS84 longitude/latitude; PubId, lon and lat are
# carried along as the attribute table
loc_spatial <-  SpatialPointsDataFrame(coords = xy,
                                       data = loc,
                                       proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))

# set to coordinates of rurality dataset (swiss coordinate system)
# NOTE(review): this target CRS has no +towgs84 term, whereas the CRS used for
# CHE below does -- confirm both are meant to be the same reference system.
loc_spatial <- spTransform(loc_spatial, CRS("+proj=somerc +lat_0=46.9524055555556 +lon_0=7.43958333333333
+k_0=1 +x_0=2600000 +y_0=1200000 +ellps=bessel +units=m +no_defs"))

# cut points that are outside scope of Switzerland
# (crop() uses extent(CHE), i.e. the bounding rectangle of the country outline,
# not the border itself; CHE is removed again right after)
CHE <- readRDS("data/gadm36_CHE_0_sp.rds") %>% spTransform(., CRS("+init=epsg:2056 +proj=somerc +lat_0=46.95240555555556 +lon_0=7.439583333333333 +k_0=1 +x_0=2600000 +y_0=1200000 +ellps=bessel +towgs84=674.374,15.056,405.346,0,0,0,0 +units=m +no_defs"))
loc_spatial <- crop(loc_spatial, extent(CHE)); rm(CHE)

# Attach Rurality From Bauzonenstatistik Schweiz (2017) ####
# can be downloaded from https://www.are.admin.ch/are/de/home/raumentwicklung-und-raumplanung/grundlagen-und-daten/bauzonenstatistik-schweiz.html

# Bauzonenstatistik Schweiz
# read the municipality-typology polygons and reproject them to EPSG:2056
shape_municipality_typology <- shapefile("data/MunicipalityTypology/ARE_GemTyp00_9.shp") %>% spTransform(., CRS("+init=epsg:2056"))

# table(shape_municipality_typology$TYP, exclude = NULL)
# barplot(table(shape_municipality_typology$TYP))
# spplot(shape_municipality_typology, "TYP")
# typology with 9 different values

# keep only the relevant attribute (TYP)
shape_municipality_typology <- shape_municipality_typology[,"TYP"]

# rurality per respondent: point-in-polygon overlay, one row of polygon
# attributes per respondent point (NA where a point falls in no polygon)
municipality_per_id <- sp::over(loc_spatial, shape_municipality_typology)
# table(municipality_per_id)

# append TYP to the respondent attribute table (which carries PubId)
loc_spatial@data <- cbind(loc_spatial@data, municipality_per_id)
# column 4 is the appended TYP column (after PubId, lon, lat)
names(loc_spatial@data)[4] <- "rurality"

# back to an ordinary data frame, keeping only the id and the rurality type
rurality <- as.data.frame(loc_spatial)
rurality <- rurality[, c("PubId", "rurality")]

# respondents without a usable coordinate keep NA in rurality
w4 <- left_join(w4, rurality, by="PubId")
#table(w4$rurality, exclude = NULL)
rm(rurality); rm(loc_spatial); rm(municipality_per_id); rm(loc); rm(shape_municipality_typology); rm(xy)

# add rurality of candidates ####
#  based on analysis of Smartvote data: https://www.smartvote.ch/de/

cand_rur <- import("data/all_candidates_rurality.dta")

# For each of the 18 candidate slots, attach the candidate's rurality as
# w4_cand<k>_rur. Candidates are matched on candidate id, first name and last
# name; the lookup table itself is never modified inside the loop.
for (cand_no in 1:18) {
  cand_prefix <- paste0("w4_cand", cand_no)
  rur_name <- paste0(cand_prefix, "_rur")

  rur_lookup <- dplyr::rename(cand_rur, !!rur_name := rurality)

  # named vector: names are the columns in w4, values the columns in the lookup
  join_keys <- c("CandidateId", "FirstName", "LastName")
  names(join_keys) <- paste0(cand_prefix, c("_id", "_firstname", "_lastname"))

  w4 <- left_join(w4, rur_lookup, by = join_keys)
}

rm(cand_rur, rur_lookup, rur_name, join_keys, cand_prefix, cand_no)

# add gender of candidates ####
# based on analysis of Smartvote data: https://www.smartvote.ch/de/

cand_sex <- import("data/candidates_selection.dta")
# candidates are matched on first name and last name only (no id is kept here)
names(cand_sex)
cand_sex <- cand_sex[, c("gender", "firstname", "lastname")]

# numeric coding: 1 = "Frauen", 2 = "Männer"; any other label turns into NA
# (with a coercion warning) in as.numeric()
cand_sex$gender[cand_sex$gender == "Frauen"] <- "1"
cand_sex$gender[cand_sex$gender == "Männer"] <- "2"
cand_sex$gender <- as.numeric(cand_sex$gender)

duplicated(cand_sex[, 2:3]) %>% table(.)
# two people with exact same name: keeping only the first occurrence is
# acceptable here because only the sex is of interest
cand_sex <- cand_sex[!duplicated(cand_sex[, c("firstname", "lastname")]), ]

# Attach the gender of each of the 18 candidate slots as w4_cand<k>_gender.
for (cand_no in 1:18) {
  cand_prefix <- paste0("w4_cand", cand_no)
  gender_name <- paste0(cand_prefix, "_gender")

  sex_lookup <- dplyr::rename(cand_sex, !!gender_name := gender)

  # names = columns in w4, values = columns in the lookup
  join_keys <- c("firstname", "lastname")
  names(join_keys) <- paste0(cand_prefix, c("_firstname", "_lastname"))

  w4 <- left_join(w4, sex_lookup, by = join_keys)
}

rm(cand_sex, sex_lookup, gender_name, join_keys, cand_prefix, cand_no)

# add plz of candidates ####
# based on analysis of smartvote data: https://www.smartvote.ch/de/
# based on this, we can find out their srph_municipality

cand_plz <- import("data/candidates_selection.dta")
# candidates are matched on first name, last name and age
names(cand_plz)
cand_plz <- cand_plz[, c("zip", "firstname", "lastname", "age")]
duplicated(cand_plz[, 2:4]) %>% table(.)
# one person is in the system twice: keep the first occurrence only
cand_plz <- cand_plz[!duplicated(cand_plz[, c("firstname", "lastname", "age")]), ]

# some candidates live abroad (zip coded -9); they get zip 0, which has no
# counterpart in the plz-to-municipality table used later
cand_plz$zip[cand_plz$zip == -9] <- 0

# Attach the zip code of each of the 18 candidate slots as w4_cand<k>_zip.
for (cand_no in 1:18) {
  cand_prefix <- paste0("w4_cand", cand_no)
  zip_name <- paste0(cand_prefix, "_zip")

  plz_lookup <- dplyr::rename(cand_plz, !!zip_name := zip)

  # names = columns in w4, values = columns in the lookup
  join_keys <- c("firstname", "lastname", "age")
  names(join_keys) <- paste0(cand_prefix, c("_firstname", "_lastname", "_age"))

  w4 <- left_join(w4, plz_lookup, by = join_keys)
}

rm(cand_plz, plz_lookup, zip_name, join_keys, cand_prefix, cand_no)

# get corresponding srph_municipality for candidates ####
# https://www.bfs.admin.ch/bfs/de/home/grundlagen/agvch/gwr-korrespondenztabelle.html
# gives srph_municipality code for each plz

muni <- import("data/CH.csv")
muni$srph_municipal <- muni$GDENR
muni$plz <- muni$DPLZ4

# one row per distinct (municipality, plz) pair
muni <- muni[, c("srph_municipal", "plz")]
muni <- muni[!duplicated(muni), ]
# 30 plz are part of Zürich (one muni)

# plz 8000 is not included in muni dataset, but is super common
muni <- rbind(muni, c(261, 8000))

# one row per plz, with all of its municipality codes in one string
muni <- muni %>%
  group_by(plz) %>%
  dplyr::summarise(municip_vec = paste(srph_municipal, collapse = ", "))

# Only the first five municipalities per plz are kept below. Warn instead of
# dropping any further ones without notice.
n_muni_per_plz <- str_count(muni$municip_vec, ",") + 1
if (any(n_muni_per_plz > 5)) {
  warning(
    sum(n_muni_per_plz > 5),
    " plz map to more than 5 municipalities; only the first 5 are kept",
    call. = FALSE
  )
}

# spread the codes over separate columns (V1, V2, ...), then keep plz + V1..V5
# (column 2 is the collapsed string and is dropped)
muni2 <- str_split(muni$municip_vec, ",", simplify = TRUE, n = 8) %>%
  as.data.frame()
muni <- cbind(muni, muni2)
muni <- muni[, c(1, 3:7)]
names(muni) <- c("plz", paste0("w4_cand1_muni", 1:5))

# unused slots become empty strings, and the blank left over from the ", "
# separator is trimmed off
for (muni_col in paste0("w4_cand1_muni", 1:5)) {
  muni[[muni_col]][is.na(muni[[muni_col]])] <- ""
  muni[[muni_col]] <- str_trim(muni[[muni_col]], side = "both")
}

rm(muni2, n_muni_per_plz, muni_col)

# Attach up to five municipality codes per candidate slot
# (w4_cand<k>_muni1 ... w4_cand<k>_muni5), matched on the candidate's zip code.
# At this point muni has six columns: the plz followed by the five municipality
# columns, so the columns can be relabelled by position for each slot.
# The join key is given explicitly; previously left_join() was called without
# `by` and relied on the zip column being the only name shared with w4.
# After the loop muni carries the w4_cand18_* names, as it did before.
for (cand_no in 1:18) {
  cand_prefix <- paste0("w4_cand", cand_no)
  names(muni) <- paste0(cand_prefix, c("_zip", paste0("_muni", 1:5)))
  w4 <- left_join(w4, muni, by = paste0(cand_prefix, "_zip"))
}
rm(cand_prefix, cand_no)

# fill nas in municipality with 0000 for foreign candidates
# All columns from w4_cand1_muni1 through w4_cand18_muni5 (a contiguous range
# created by the joins above) get "0000" wherever they are NA. This also covers
# any other candidate slot whose zip found no match in the lookup.
start <- match("w4_cand1_muni1", names(w4))
end <- match("w4_cand18_muni5", names(w4))

w4[start:end] <- lapply(
  w4[start:end],
  function(muni_col) replace_na(muni_col, "0000")
)

## match variable

# For candidate slots 1 to 9: w4_cand<k>_localitymatch is 1 when any of the
# candidate's (up to five) municipality codes equals the respondent's
# municipality, and 0 otherwise. Rows where the comparison is NA (e.g. missing
# respondent municipality) stay 0. The remaining candidate slots are handled
# by the statements that follow.
# NOTE(review): w4$srph_municipal may resolve through partial matching of `$`
# (other comments call the variable srph_municipality) -- confirm the column name.
for (cand_no in 1:9) {
  match_col <- paste0("w4_cand", cand_no, "_localitymatch")
  w4[[match_col]] <- 0
  for (muni_col in paste0("w4_cand", cand_no, "_muni", 1:5)) {
    w4[[match_col]][w4[[muni_col]] == w4$srph_municipal] <- 1
  }
  # table(w4[[match_col]], exclude = NULL)
}
rm(cand_no, match_col, muni_col)

w4$w4_cand10_localitymatch <- 0
w4$w4_cand10_localitymatch[w4$w4_cand10_muni1==w4$srph_municipal] <- 1
w4$w4_cand10_localitymatch[w4$w4_cand10_muni2==w4$srph_municipal] <- 1
w4$w4_cand10_localitymatch[w4$w4_cand10_muni3==w4$srph_municipal] <- 1
w4$w4_cand10_localitymatch[w4$w4_cand10_muni4==w4$srph_municipal] <- 1
w4$w4_cand10_localitymatch[w4$w4_cand10_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand10_localitymatch, exclude = NULL)

w4$w4_cand11_localitymatch <- 0
w4$w4_cand11_localitymatch[w4$w4_cand11_muni1==w4$srph_municipal] <- 1
w4$w4_cand11_localitymatch[w4$w4_cand11_muni2==w4$srph_municipal] <- 1
w4$w4_cand11_localitymatch[w4$w4_cand11_muni3==w4$srph_municipal] <- 1
w4$w4_cand11_localitymatch[w4$w4_cand11_muni4==w4$srph_municipal] <- 1
w4$w4_cand11_localitymatch[w4$w4_cand11_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand11_localitymatch, exclude = NULL)

w4$w4_cand12_localitymatch <- 0
w4$w4_cand12_localitymatch[w4$w4_cand12_muni1==w4$srph_municipal] <- 1
w4$w4_cand12_localitymatch[w4$w4_cand12_muni2==w4$srph_municipal] <- 1
w4$w4_cand12_localitymatch[w4$w4_cand12_muni3==w4$srph_municipal] <- 1
w4$w4_cand12_localitymatch[w4$w4_cand12_muni4==w4$srph_municipal] <- 1
w4$w4_cand12_localitymatch[w4$w4_cand12_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand12_localitymatch, exclude = NULL)

w4$w4_cand13_localitymatch <- 0
w4$w4_cand13_localitymatch[w4$w4_cand13_muni1==w4$srph_municipal] <- 1
w4$w4_cand13_localitymatch[w4$w4_cand13_muni2==w4$srph_municipal] <- 1
w4$w4_cand13_localitymatch[w4$w4_cand13_muni3==w4$srph_municipal] <- 1
w4$w4_cand13_localitymatch[w4$w4_cand13_muni4==w4$srph_municipal] <- 1
w4$w4_cand13_localitymatch[w4$w4_cand13_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand13_localitymatch, exclude = NULL)

w4$w4_cand14_localitymatch <- 0
w4$w4_cand14_localitymatch[w4$w4_cand14_muni1==w4$srph_municipal] <- 1
w4$w4_cand14_localitymatch[w4$w4_cand14_muni2==w4$srph_municipal] <- 1
w4$w4_cand14_localitymatch[w4$w4_cand14_muni3==w4$srph_municipal] <- 1
w4$w4_cand14_localitymatch[w4$w4_cand14_muni4==w4$srph_municipal] <- 1
w4$w4_cand14_localitymatch[w4$w4_cand14_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand14_localitymatch, exclude = NULL)

w4$w4_cand15_localitymatch <- 0
w4$w4_cand15_localitymatch[w4$w4_cand15_muni1==w4$srph_municipal] <- 1
w4$w4_cand15_localitymatch[w4$w4_cand15_muni2==w4$srph_municipal] <- 1
w4$w4_cand15_localitymatch[w4$w4_cand15_muni3==w4$srph_municipal] <- 1
w4$w4_cand15_localitymatch[w4$w4_cand15_muni4==w4$srph_municipal] <- 1
w4$w4_cand15_localitymatch[w4$w4_cand15_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand15_localitymatch, exclude = NULL)

w4$w4_cand16_localitymatch <- 0
w4$w4_cand16_localitymatch[w4$w4_cand16_muni1==w4$srph_municipal] <- 1
w4$w4_cand16_localitymatch[w4$w4_cand16_muni2==w4$srph_municipal] <- 1
w4$w4_cand16_localitymatch[w4$w4_cand16_muni3==w4$srph_municipal] <- 1
w4$w4_cand16_localitymatch[w4$w4_cand16_muni4==w4$srph_municipal] <- 1
w4$w4_cand16_localitymatch[w4$w4_cand16_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand16_localitymatch, exclude = NULL)

w4$w4_cand17_localitymatch <- 0
w4$w4_cand17_localitymatch[w4$w4_cand17_muni1==w4$srph_municipal] <- 1
w4$w4_cand17_localitymatch[w4$w4_cand17_muni2==w4$srph_municipal] <- 1
w4$w4_cand17_localitymatch[w4$w4_cand17_muni3==w4$srph_municipal] <- 1
w4$w4_cand17_localitymatch[w4$w4_cand17_muni4==w4$srph_municipal] <- 1
w4$w4_cand17_localitymatch[w4$w4_cand17_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand17_localitymatch, exclude = NULL)

w4$w4_cand18_localitymatch <- 0
w4$w4_cand18_localitymatch[w4$w4_cand18_muni1==w4$srph_municipal] <- 1
w4$w4_cand18_localitymatch[w4$w4_cand18_muni2==w4$srph_municipal] <- 1
w4$w4_cand18_localitymatch[w4$w4_cand18_muni3==w4$srph_municipal] <- 1
w4$w4_cand18_localitymatch[w4$w4_cand18_muni4==w4$srph_municipal] <- 1
w4$w4_cand18_localitymatch[w4$w4_cand18_muni5==w4$srph_municipal] <- 1
#table(w4$w4_cand18_localitymatch, exclude = NULL)


# Split Vote ####

# help variables: have respondents voted for at least one candidate from each party?
# Candidates 1-3 belong to party 1, 4-6 to party 2, ..., 16-18 to party 6.
# partyK_vote: 1 = at least one vote went to a candidate of party K,
#              0 = no vote for that party,
#              NA = the vote total for that party is missing.
for (party_idx in 1:6) {
  party_cands <- (3 * party_idx - 2):(3 * party_idx)
  party_votes <- Reduce(`+`, w4[paste0("w4_q33x", party_cands)])
  voted_party <- rep(NA_real_, nrow(w4))
  voted_party[party_votes > 0] <- 1
  voted_party[party_votes == 0] <- 0
  w4[[paste0("party", party_idx, "_vote")]] <- voted_party
}

# ticket splitting: 1 = votes spread over more than one party,
# 0 = all votes went to a single party, NA = no party voted for / missing
n_parties_voted <- Reduce(`+`, w4[paste0("party", 1:6, "_vote")])
w4$ticket_splitting <- ifelse(n_parties_voted > 1, 1,
                              ifelse(n_parties_voted == 1, 0, NA))
prop.table(table(w4$ticket_splitting)) # 73.4% split votes
rm(party_idx, party_cands, party_votes, voted_party, n_parties_voted)

# SV score of pid party ####
# uses smartvote data: https://www.smartvote.ch/de/ 

# Load the smartvote candidate file and compute, per party, the mean
# environmental score (cleavage_6) and mean left-right position (smartmap_x)
# of incumbent National Council (NR) candidates.
cand <- import("data/candidates_NRSR_2019-10-14.dta")
names(cand)
# keep: NR candidates who are incumbents
table(cand$incumbent_NR, exclude = NULL) # 170 out of 200 candidates are in the data
# which() drops rows with a missing incumbent flag; a bare logical index
# would turn every NA into an all-NA row (and an NA party group below)
cand <- cand[which(cand$incumbent_NR == 1), ]

# NOTE: `%>%` binds tighter than `*`, so the former
# `mean(...) *10 %>% round(.,2)` rounded the constant 10 instead of the
# result; round() is now applied to the rescaled mean as intended.
party_means <- cand %>%
  group_by(party_REC) %>%
  dplyr::summarise(
    mean_sv = mean(cleavage_6, na.rm = TRUE) / 10,
    mean_lr = round(mean(smartmap_x, na.rm = TRUE) * 10, 2)
  )

# Score of the party a respondent identifies with (w4_q23_re).
# Position k of each lookup vector holds the value for party code k:
# 1 = CVP, 2 = FDP, 3 = SVP (and Lega), 4 = SP, 5 = GPS, 6 = glp, 7 = BDP.
# Respondents with any other code (or a missing code) stay NA.
pid_sv_scores <- c(5.13, 4.29, 1.96, 9.48, 9.82, 8.82, 6.62)

w4$pid_SV <- NA
for (party_code in seq_along(pid_sv_scores)) {
  w4$pid_SV[w4$w4_q23_re == party_code] <- pid_sv_scores[[party_code]]
}

# LR score of pid party ####

pid_lr_scores <- c(4.05, 5.11, 5.91, 0.73, 0.51, 2.89, 3.69)

w4$pid_LR <- NA
for (party_code in seq_along(pid_lr_scores)) {
  w4$pid_LR[w4$w4_q23_re == party_code] <- pid_lr_scores[[party_code]]
}

table(w4$pid_LR, w4$pid_SV) # strictly ordered.
rm(party_means)
rm(pid_sv_scores, pid_lr_scores, party_code)

# Alignment - Absolute Difference ####

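# difference between SVself and SVpolitician, LRself and LRpolitician
# (stored with its sign, despite the `_diff_abs` suffix; abs() is only applied
# further below when the closest candidate is determined)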

# structure dataset: one line per respondent, 
# columns for each candidate they saw,
# columns for difference in SV and LR with each candidate and self

# SV env: env_score_ego - w4_cand1_envscore
# negative values: candidate is greener.
# positive values: respondent is greener.

# Signed environmental-score difference (respondent minus candidate) for each
# of the 18 candidates, stored as w4_cand<k>_SV_diff_abs.
for (cand_idx in 1:18) {
  cand_env <- w4[[paste0("w4_cand", cand_idx, "_envscore")]]
  w4[[paste0("w4_cand", cand_idx, "_SV_diff_abs")]] <- w4$env_score_ego - cand_env
}
rm(cand_idx, cand_env)


# LiRe: w4_q22_rescaled, w4_cand1_leftright

# negative values: candidate is more conservative
# positive values: respondent is more conservative

# Signed left-right difference (respondent minus candidate) for each of the
# 18 candidates, stored as w4_cand<k>_LR_diff_abs.
for (cand_idx in 1:18) {
  cand_lr <- w4[[paste0("w4_cand", cand_idx, "_leftright")]]
  w4[[paste0("w4_cand", cand_idx, "_LR_diff_abs")]] <- w4$w4_q22_rescaled - cand_lr
}
rm(cand_idx, cand_lr)

# Alignment - number of candidates chosen from PID party ####

# everyone who has a different party as PID than the ones we present is set to NA
# 648 respondents

# code PID alignment for each candidate

# w4_cand<k>_PID_align for candidates 1-18:
#   1  = candidate k's party (w4_cand<k>_partyid) equals the respondent's
#        party identification (w4_q23_re),
#   0  = the two differ,
#   NA = the comparison is NA (party id missing on either side).
# Candidates 1-3 (4-6, ...) are from the same party, so it makes sense that
# they show the same counts.
for (cand_idx in 1:18) {
  same_party <- w4$w4_q23_re == w4[[paste0("w4_cand", cand_idx, "_partyid")]]
  pid_align <- rep(NA, nrow(w4))
  pid_align[same_party] <- 1
  pid_align[!same_party] <- 0
  w4[[paste0("w4_cand", cand_idx, "_PID_align")]] <- pid_align
}
# check with e.g. (should be as many 1s as on the diagonal of the crosstab):
# table(w4$w4_cand1_PID_align, exclude = NULL)
rm(cand_idx, same_party, pid_align)

# Number of votes given to candidates of the respondent's own PID party (0-3).
# Votes are weighted by w4_q33x<k>, e.g. one vote for candidate 5 and two
# votes for candidate 8 add 1 and 2 respectively if those candidates align.
# An NA alignment flag (or an NA vote count on an aligned candidate) makes the
# running total NA for that respondent, and it stays NA from then on.
pid_votes <- rep(0, nrow(w4))
for (cand_idx in 1:18) {
  is_aligned <- w4[[paste0("w4_cand", cand_idx, "_PID_align")]] == 1
  cand_votes <- w4[[paste0("w4_q33x", cand_idx)]]
  pid_votes <- ifelse(is_aligned, pid_votes + cand_votes, pid_votes)
}
w4$w4_PID_align_sum <- pid_votes

# respondents without a party identification get no score
w4$w4_PID_align_sum[is.na(w4$w4_q23_re)] <- NA
#table(w4$w4_PID_align_sum, exclude = NULL)
rm(pid_votes, cand_idx, is_aligned, cand_votes)

# Alignment - giving vote(s) to the candidate closest to ego ####

#
# SV - environmental dimension #
# 
# for each respondent - who is the candidate closest to them
# lowest abs. difference between ego and cand.
# if it is multiple, they all get marked as the candidate as closest
# smallest absolute SV distance between the respondent and any of the 18
# candidates (row-wise minimum; NA as soon as one candidate distance is NA)
sv_abs_dists <- lapply(paste0("w4_cand", 1:18, "_SV_diff_abs"),
                       function(col) abs(w4[[col]]))
w4 <- transform(w4, SV_diff_abs_min = do.call(pmin, sv_abs_dists))
rm(sv_abs_dists)
# LR - left-right dimension #
# 
# for each respondent - who is the candidate closest to them
# lowest abs. difference between ego and cand.
# if it is multiple, they all get marked as the candidate as closest
# smallest absolute LR distance between the respondent and any of the 18
# candidates (row-wise minimum; NA as soon as one candidate distance is NA)
lr_abs_dists <- lapply(paste0("w4_cand", 1:18, "_LR_diff_abs"),
                       function(col) abs(w4[[col]]))
w4 <- transform(w4, LR_diff_abs_min = do.call(pmin, lr_abs_dists))
rm(lr_abs_dists)



# Exclude some respondents: ####

# WHO LIVE IN CANTONS THAT DID NOT PARTICIPATE IN THE EXPERIMENT
# keep only respondents living in one of the 20 cantons listed here
w4 <- w4[w4$srph_canton %in% c("AG", "BE", "BL", "BS",
                               "FR", "GE", "GR", "JU",
                               "LU", "NE", "SG", "SH",
                               "SO", "SZ", "TG", "TI",
                               "VD", "VS", "ZG", "ZH"), ]
# 621 respondents come from these cantons

# exclude respondents who are not eligible to vote
# which() also drops respondents with a missing w4_q24. A bare logical index
# would instead turn each of them into an all-NA row, which was previously
# only removed as a side effect of the sum_exp filter below (same final
# sample, but now explicit).
w4 <- w4[which(w4$w4_q24 != 3), ]

# WHO DID NOT TAKE PART IN THE EXPERIMENT (NO RESPONSE)
# sum_exp = total number of votes given across the 18 candidates;
# NA if any of the 18 vote variables is missing
w4$sum_exp <- Reduce(`+`, w4[paste0("w4_q33x", 1:18)])
#table(w4$sum_exp, exclude = NULL)

# keep respondents who cast 1, 2 or 3 votes (%in% also drops NA totals)
w4 <- w4[w4$sum_exp %in% c(1, 2, 3), ]

stopifnot(nrow(w4) == 6268)
# final number of participants: 6268

# throw out variables which cannot be used in public dataset or are no longer used in analysis
# Logical negation instead of `-which(...)`: if none of the names were found,
# `-which(...)` would be `-integer(0)` and silently drop ALL columns.
w4 <- w4[, !(names(w4) %in% c("srph_canton", "srph_municipal")), drop = FALSE]

# Drop every column positioned from w4_cand1_zip through w4_cand18_zip
# (inclusive); fail loudly if an endpoint is missing.
zip_span <- match(c("w4_cand1_zip", "w4_cand18_zip"), names(w4))
stopifnot(!anyNA(zip_span))
w4 <- w4[, -seq(zip_span[1], zip_span[2]), drop = FALSE]

# Same for the block from w4_cand1_muni1 through w4_cand18_muni5.
muni_span <- match(c("w4_cand1_muni1", "w4_cand18_muni5"), names(w4))
stopifnot(!anyNA(muni_span))
w4 <- w4[, -seq(muni_span[1], muni_span[2]), drop = FALSE]
rm(zip_span, muni_span)

# create artificial PubId (running row number) for non-linkage to other public data
# seq_len() is safe even for an empty data frame, unlike 1:nrow()
w4$PubId <- seq_len(nrow(w4))

# save data (FALSE spelled out: `F` is an ordinary variable and can be reassigned)
write.csv(w4, "w4_final.csv", row.names = FALSE)
